# Load all packages here, consider to mute output of this code chunk
# https://rmarkdown.rstudio.com/lesson-3.html
# Use library() instead of require(): require() returns FALSE on a missing
# package instead of failing, which silently hides setup errors in a report.
library(RPostgreSQL) # database access (not used in visible code -- verify)
library(getPass)     # password prompt (not used in visible code -- verify)
library(tidyverse)
library(readr) # part of tidyverse; kept for explicitness
library(dplyr) # part of tidyverse; kept for explicitness
library(sf) # <- optional
library(plotly) # <- optional
library(geosphere)

Report 2.1: Hobo measurement campaign - quality control
Environmental Monitoring, Data Analysis and Visualization
Setup
Loaded packages
Preprocessing / R functions
#loading in the hobo data
# Raw HOBO export: the first file line is a title row (skip = 1); col_types
# "icnnccccc" pins column classes (integer id, character timestamp, numeric
# temperature/illuminance, remaining logger-event columns as character).
hobo_data <- read_csv('https://raw.githubusercontent.com/data-hydenv/data/refs/heads/master/hobo/2025/raw/10350042.csv', skip = 1, col_types = "icnnccccc")
#clean up the data
#renaming columns
# Columns addressed by position because the export headers are long,
# device-specific strings.
colnames(hobo_data)[2] = 'datetime'
colnames(hobo_data)[3] = 'temp'
colnames(hobo_data)[4] = 'illum'
#convert date time to actual datetime column
# The format string contains spaces after the slashes -- presumably matching
# the export's "MM/ DD/ YYYY" spelling; verify against the raw file.
# NOTE(review): strptime() returns POSIXlt; wrapping in as.POSIXct() would be
# safer for a data-frame column -- confirm the merge() below tolerates this.
hobo_data$datetime <- strptime(hobo_data$datetime, format = "%m/ %d/ %Y %I:%M:%S %p")
head(hobo_data)

1. Hobo meta information
The Hobo logger was placed inside a plastic planter and secured with wire. It was placed next to a temperature sensor on the main tower (mixed plot) with a height of 47 m and very high influence. It is possible that the plastic planter does not provide enough shielding from the radiation, potentially causing readings to be higher than the actual temperature during day time. Also, the plastic material may trap heat during warm and/or sunny periods.
2. Consistent HOBO data file
#reading calibration data
# Calibration file: the first three lines are metadata (skip = 3).
t_cal <- read.csv('https://raw.githubusercontent.com/data-hydenv/data/refs/heads/master/hobo/2025/calibration.csv', skip = 3)
#convert to datetime column
# NOTE(review): strptime() yields POSIXlt; as.POSIXct() is generally more
# robust inside a data frame -- verify merge() behaves as expected.
t_cal$datetime <- strptime(t_cal$dttm, format = "%Y-%m-%d %H:%M")
head(t_cal)2.1. Calibration
Calibration of raw measurements against temperatures recorded by HOBO when in a bucket of hot water and cold water. The calibration value was measured with the GMH3750 Thermometer.
#merging hobo and calibration data
# Inner merge on the shared timestamp keeps only the calibration window.
# (Trailing comma removed from select(): it creates an empty argument.)
merged_data <- merge(t_cal, select(hobo_data, datetime, temp, illum), by = "datetime")
#remove the first 20 mins
# Drop the first two 10-minute readings per bucket (the first 20 minutes),
# giving the logger time to equilibrate with the water temperature.
merged_data <- merged_data %>%
group_by(bucket) %>%
slice(3:n())
#create a linear relationship b/w measured and calibration temperature
# T is the reference-thermometer column of the calibration file. glm() with
# the default gaussian family is equivalent to lm() here; kept as glm() so
# the printed summary below remains valid.
fm <- glm(T ~ temp, data = merged_data)
summary(fm)
Call:
glm(formula = T ~ temp, data = merged_data)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 63.04572 0.79291 79.51 0.00801 **
temp -1.07015 0.02639 -40.56 0.01569 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 0.2414812)
Null deviance: 397.44747 on 2 degrees of freedom
Residual deviance: 0.24148 on 1 degrees of freedom
AIC: 6.9549
Number of Fisher Scoring iterations: 2
#apply relationship to the hobo measurements
hobo_data$temp_cal <- predict(fm, newdata = data.frame("temp" = hobo_data$temp))2.2. Create HOBO data file
#renaming columns
colnames(hobo_data)[1] <- "id"
colnames(hobo_data)[2] <- "dttm"
#calibration temperature columns
# NOTE(review): this selects the RAW `temp` column, not the calibrated
# `temp_cal` computed just above -- the calibration never reaches the output
# file. Confirm whether `temp_cal` was intended here.
hobo_clean <- select(hobo_data, id, dttm, temp)
colnames(hobo_clean)[3] <- "temp"
#specifying the time periods and date temp transformations
# Restrict to the campaign window, format dttm/temp as fixed-precision
# strings (3 decimals), renumber ids 1..n, then write the 10-minute file.
hobo_clean <- hobo_clean %>%
filter(dttm >= '2024-10-31 00:00' & dttm <= '2024-12-15 23:50') %>%
transform(dttm = as.character(format(dttm, "%Y-%m-%d %H:%M")), temp = trimws(format(round(temp, 3), nsmall = 3))) %>%
mutate(id = 1:n()) %>%
write_csv(., file = "./data/output/10350042.csv")2.3. Verify your file
# Read the file back into R to verify the written output.
# Use the same relative path as the write_csv() call above instead of an
# absolute machine-specific D:/ path, so the report is reproducible elsewhere.
output_data <- read_csv("./data/output/10350042.csv", show_col_types = FALSE)
head(output_data)summary(output_data) id dttm temp
Min. : 1 Min. :2024-10-31 00:00:00 Min. :-4.031
1st Qu.:1657 1st Qu.:2024-11-11 11:57:30 1st Qu.: 1.980
Median :3312 Median :2024-11-22 23:55:00 Median : 4.727
Mean :3312 Mean :2024-11-22 23:55:00 Mean : 4.963
3rd Qu.:4968 3rd Qu.:2024-12-04 11:52:30 3rd Qu.: 7.079
Max. :6624 Max. :2024-12-15 23:50:00 Max. :18.996
3. Quality control
#|
# Load data from Github (then eval = TRUE)
# Re-load the uploaded 10-minute file from the repository so all QC steps
# below run on exactly the published data, not the local intermediate.
data <- read_csv("https://raw.githubusercontent.com/mishlim/data/refs/heads/master/hobo/2025/10_minutes/10350042.csv", show_col_types = FALSE)
head(data)nrow(data)[1] 6624
3.1. First impression
The temperature fluctuates between -5 and 20 degree celsius approximately. We can observe trends and patterns in early November with peaks and troughs. The overall temperature noticeably declines in December. In early December an increase in temperatures can be observed with several spikes reaching close to 20 degree C. The uncharacteristic warm period in November may indicate some warming event or other factors that caused it. But the other values seem like typical temperature measurements of a winter in Southern Germany.
#plotting the uploaded data as a line plot
# Base-graphics time series of the full campaign; the y-limits match the
# approximate observed range (-5 to 20 degC).
plot(
data$dttm,
data$temp,
type='l',
xlab='Time (10 min intervals)',
ylab='Measured Temperature (°C)',
main = 'Measured Temperature',
ylim = c(-5,20)
)3.2. Quality control procedures (QCPs)
3.2.1. Measurement range (Plausible values)
# Define valid temperature range for our hobo model
# QCP 1: plausible-value check against the sensor's specified operating range.
min_valid_temp <- -20
max_valid_temp <- 70
# Check for out-of-range values with the temperature range
# Rows outside [min, max] would be retained in qc1 for inspection.
qc1 <- data %>%
filter(temp < min_valid_temp | temp > max_valid_temp)
if (nrow(qc1) == 0) {
cat("All temperature values are within the valid range.\n")
} else {
cat("Out-of-range temperature values found:\n")
print(qc1)
}All temperature values are within the valid range.
3.2.2. Plausible rate of change
# Add a column to compute the temperature difference between consecutive rows
# QCP 2: rate-of-change check. lag(temp) is NA on the first row, so the
# first temp_diff is NA and is silently dropped by the filter below.
qc2 <- data %>%
mutate(temp_diff = abs(temp - lag(temp)))
# Filter points where the difference exceeds 1 K
qc2_flags <- qc2 %>%
filter(temp_diff > 1)
# Check for flags and print results
if (nrow(qc2_flags) == 0) {
cat("All temperature changes are within 1 K compared to the previous point.\n")
} else {
cat("Temperature changes greater than 1 K found:\n")
print(qc2_flags)
cat("\nNumber of QC2 flags:", nrow(qc2_flags), "\n")
}Temperature changes greater than 1 K found:
# A tibble: 135 × 4
id dttm temp temp_diff
<dbl> <dttm> <dbl> <dbl>
1 225 2024-11-01 13:20:00 12.1 1.07
2 229 2024-11-01 14:00:00 13.8 1.74
3 272 2024-11-01 21:10:00 8.08 1.78
4 278 2024-11-01 22:10:00 9.18 1.29
5 282 2024-11-01 22:50:00 8.58 1.28
6 283 2024-11-01 23:00:00 7.48 1.1
7 497 2024-11-03 10:40:00 9.96 1.28
8 499 2024-11-03 11:00:00 11.2 1.47
9 501 2024-11-03 11:20:00 12.4 1.36
10 505 2024-11-03 12:00:00 14.7 2.60
# ℹ 125 more rows
Number of QC2 flags: 135
# View the flagged data points
View(qc2_flags)3.2.3. Minimum variability (Persistence)
# QCP 3: persistence check. A point fails when it equals the five previous
# readings, i.e. the value was constant for a full 60 minutes (6 x 10 min).
# The first five rows yield NA (lag undefined) and are dropped by filter().
# NOTE(review): exact == comparison of doubles appears intentional (logger
# readings are discrete steps) -- confirm no rounding was applied upstream.
qc3 <- data %>%
mutate(
# employ a Rolling check for constant temperatures
unchanged_60_min = (temp == lag(temp, 1) &
temp == lag(temp, 2) &
temp == lag(temp, 3) &
temp == lag(temp, 4) &
temp == lag(temp, 5))
) %>%
filter(unchanged_60_min)
# Print the data points that failed the data checks
if (nrow(qc3) == 0) {
cat("No data points failed the 60-minute constant temperature QCP.\n")
} else {
cat("Data points with 60-minute constant temperature failures:\n")
print(qc3)
cat("\nNumber of flags:", nrow(qc3), "\n")
}Data points with 60-minute constant temperature failures:
# A tibble: 269 × 4
id dttm temp unchanged_60_min
<dbl> <dttm> <dbl> <lgl>
1 6 2024-10-31 00:50:00 8.48 TRUE
2 12 2024-10-31 01:50:00 8.58 TRUE
3 33 2024-10-31 05:20:00 8.48 TRUE
4 43 2024-10-31 07:00:00 8.08 TRUE
5 51 2024-10-31 08:20:00 8.58 TRUE
6 117 2024-10-31 19:20:00 8.48 TRUE
7 118 2024-10-31 19:30:00 8.48 TRUE
8 119 2024-10-31 19:40:00 8.48 TRUE
9 120 2024-10-31 19:50:00 8.48 TRUE
10 121 2024-10-31 20:00:00 8.48 TRUE
# ℹ 259 more rows
Number of flags: 269
3.3. Summarize
QC 1] Check each temperature data point to be in the measurement range - All recorded temperatures were within the specified valid range of -20°C to 70°C. No data points were flagged as outliers due to unrealistic or extreme temperature values.
QC 2] Check each temperature data point to have not more than 1 K (1 Kelvin) temperature change compared to the previous data point - 2.04% of the data points were flagged; several showed sudden temperature changes greater than 1 K. On 2024-11-01 at 14:00, a jump of 1.737 K was observed. On 2024-11-03, a sharp increase of 2.599 K was observed. These could be due to rapid heating/cooling of the plastic planter, or sensor interference from external factors such as condensation or other physical disturbances.
QC 3] If temperature has not changed during the last 60 minutes (i.e. data point Ti plus 5 data points before, so from Ti−1 to Ti−5) the corresponding data point Ti failed in this QCP - 4% of the data showed flags. This could be due to sensor malfunction, as the temperature remained the same for 6 consecutive measurements. Could also be due to data logging issues in transmitting or recording data.
# QCP thresholds: plausible sensor operating range.
min_valid_temp <- -20
max_valid_temp <- 70

# Compute all three QC flags in a single pipeline over the 10-minute data:
#   qc1_flag -- reading outside the plausible range
#   qc2_flag -- jump of more than 1 K versus the previous reading
#   qc3_flag -- value unchanged over the last 60 minutes (6 readings)
# qc_all is TRUE when any individual check fires; it is NA on the leading
# rows where lag() has no predecessor, exactly as with separate mutate calls.
data <- data %>%
  mutate(
    qc1_flag = temp < min_valid_temp | temp > max_valid_temp,
    temp_diff = abs(temp - lag(temp)),
    qc2_flag = temp_diff > 1,
    unchanged_60_min = (temp == lag(temp, 1) &
                        temp == lag(temp, 2) &
                        temp == lag(temp, 3) &
                        temp == lag(temp, 4) &
                        temp == lag(temp, 5)),
    qc3_flag = unchanged_60_min,
    qc_all = qc1_flag | qc2_flag | qc3_flag
  )

# Inspect the fully flagged table interactively.
View(data)

# Reduced table carrying only the timestamp, temperature and QC flags.
qc_df <- data %>%
  select(dttm, temp, qc1_flag, qc2_flag, qc3_flag, qc_all)
head(qc_df)# Count the number of failures for each QCP
# Total failures per QCP; na.rm = TRUE excludes the leading NA flags that
# lag() produces where no predecessor exists.
qc_counts <- qc_df %>%
summarise(
qc1_fail = sum(qc1_flag, na.rm = TRUE),
qc2_fail = sum(qc2_flag, na.rm = TRUE),
qc3_fail = sum(qc3_flag, na.rm = TRUE),
qc_all_fail = sum(qc_all, na.rm = TRUE)
)
head(qc_counts)# transform the data for plotting
# Reshape the one-row count table to long format (one row per QC type).
# NOTE(review): gather() is superseded; pivot_longer() is the modern
# equivalent -- kept as-is here to avoid changing the output object.
qc_counts_long <- qc_counts %>%
gather(key = "qc_type", value = "failures")
# Create a bar plot
# Bar chart of failure counts per QC check; stat = "identity" plots the
# precomputed counts directly rather than tallying rows.
ggplot(qc_counts_long, aes(x = qc_type, y = failures, fill = qc_type)) +
geom_bar(stat = "identity", show.legend = FALSE) +
labs(
title = "Quality Control Check Failures",
x = "Quality Control Check",
y = "Number of Failures"
) +
theme_minimal()3.4. Aggregate
# Step 1: Truncate each 10-minute timestamp to the start of its hour.
qc_df <- qc_df %>%
mutate(hour = floor_date(dttm, "hour"))
# Step 2: Aggregate by hour
# Hourly mean temperature; an hour containing two or more flagged 10-minute
# values is considered unreliable and set to NA (filled from the reference
# station later).
hobo_hourly <- qc_df %>%
group_by(hour) %>%
summarize(
flag_count = sum(qc_all, na.rm = TRUE),
# Aggregate temperature for the hour, set to NA if two or more flags are found
th = ifelse(flag_count <= 1, round(mean(temp, na.rm = TRUE), 4), NA_real_)
) %>%
select(date_time = hour, th)
# Step 3: Set the time zone to UTC+1 (if necessary)
# NOTE(review): "CET" observes DST while the reference data below uses
# "Etc/GMT-1" (fixed UTC+1); identical within this Oct 31 - Dec 15 window
# (after the October DST change), but confirm if the window ever moves.
hobo_hourly <- hobo_hourly %>%
mutate(date_time = force_tz(date_time, tzone = "CET"))
print(head(hobo_hourly))# A tibble: 6 × 2
date_time th
<dttm> <dbl>
1 2024-10-31 00:00:00 8.48
2 2024-10-31 01:00:00 8.58
3 2024-10-31 02:00:00 8.53
4 2024-10-31 03:00:00 8.53
5 2024-10-31 04:00:00 8.43
6 2024-10-31 05:00:00 8.35
write.csv(hobo_hourly, "hobo_hourly.csv", row.names = FALSE)
View(hobo_hourly)4. Fill-up with reference station
In order to fill the NA values in the above data, the data from the reference station was selected and used. Then, a linear model was applied to fill in the missing data. As HOBO 10350042 was at a height of 47 m, the closest tower is the Main tower and the tower selected was the 47 m Ecosense Main Tower (reference/2025/AirTemp.+47m@Ecosense_MainTower.20241020.csv)
4.1. Find reference station
# Read the shapefile
# NOTE(review): absolute machine-specific path -- breaks reproducibility on
# other machines; consider a project-relative path.
tow_loc <- read_sf("D:/EMDAV/02_infrastructure/tower.shp")
# Transform to WGS 84 (EPSG:4326)
tow_loc <- st_transform(tow_loc, crs = 4326)
# Extract coordinates from the geometry column
tow_coords <- st_coordinates(tow_loc)
# Save each tower's coordinates
# NOTE(review): assumes rows 1-3 are Main, Douglas and Beech in that order --
# verify against the shapefile's attribute table.
tow_M <- c(tow_coords[1, "X"], tow_coords[1, "Y"])
tow_D <- c(tow_coords[2, "X"], tow_coords[2, "Y"])
tow_B <- c(tow_coords[3, "X"], tow_coords[3, "Y"])
# HOBO logger location
hobo_LAT <- 48.268527
hobo_LONG <- 7.8782235
# distHaversine() expects c(longitude, latitude) -- this ordering matches.
loc_hobo <- c(hobo_LONG, hobo_LAT)
# Calculate distances using distHaversine
distance_M <- distHaversine(loc_hobo, tow_M)
distance_D <- distHaversine(loc_hobo, tow_D)
distance_B <- distHaversine(loc_hobo, tow_B)
# Print results
print(paste("Distance to tower M:", distance_M, "m"))[1] "Distance to tower M: 0.524219294862582 m"
print(paste("Distance to tower D:", distance_D, "m"))[1] "Distance to tower D: 79.5067696039583 m"
print(paste("Distance to tower B:", distance_B, "m"))[1] "Distance to tower B: 116.172565637209 m"
# Identify the closest tower. A named vector with which.min() replaces the
# nested scalar ifelse() calls (ifelse() is meant for vectors) and avoids
# comparing floating-point distances with ==; ties still resolve to the
# first minimum (Main before Douglas before Beech), as before.
tower_dists <- c(
  "Main Tower (M)" = distance_M,
  "Douglas Fir Tower (D)" = distance_D,
  "Beech Tower (B)" = distance_B
)
closest_idx <- which.min(tower_dists)
min_distance <- tower_dists[[closest_idx]] # [[ drops the name
closest_tower <- names(tower_dists)[closest_idx]
print(paste("The closest tower is:", closest_tower, "with a distance of", min_distance, "m"))[1] "The closest tower is: Main Tower (M) with a distance of 0.524219294862582 m"
print(paste("Distance to tower D:", distance_D, "m"))[1] "Distance to tower D: 79.5067696039583 m"
print(paste("Distance to tower B:", distance_B, "m"))[1] "Distance to tower B: 116.172565637209 m"
4.2. Fill-up
# Reference series: 47 m air temperature at the Ecosense MAIN tower (see URL;
# first 14 lines are metadata).
# NOTE(review): the object is named beech_data but holds Main-tower data --
# consider renaming; kept as-is to avoid breaking later references.
beech_data <- read.csv("https://raw.githubusercontent.com/data-hydenv/data/refs/heads/master/reference/2025/AirTemp.%2B47m%40Ecosense_MainTower.20241020.csv", skip = 14)
beech_data <- beech_data %>%
mutate(
# Parse the UTC timestamp; format "%Y-%m-%d %H" stops at the hour, so any
# minutes in the raw string are ignored (effectively truncating to hours).
dttm = strptime(
Timestamp..UTC.00.00.,
format = "%Y-%m-%d %H",
tz = "UTC"
)
) %>%
# Convert UTC to UTC+1 ("Etc/GMT-1" is UTC+1; the Etc sign is inverted).
mutate(date_time = with_tz(dttm, tzone = "Etc/GMT-1")) %>%
# Average any duplicate readings within the same (truncated) hour.
group_by(date_time) %>%
summarize(beech_th = mean(Value, na.rm = TRUE)) %>%
ungroup() %>%
select(date_time, beech_th) %>%
# Restrict to the HOBO campaign window (strings coerced to POSIXct).
filter(date_time >= '2024-10-31 00:00' & date_time <= '2024-12-15 23:00')
# merge the reference data with the hobo data
# Left join keeps every HOBO hour; hours missing in the reference get NA.
hobo_beech <- left_join(hobo_hourly, beech_data, by = "date_time")
# rename columns
colnames(hobo_beech)[2] = 'th'
head(hobo_beech)# Create a new column to flag missing values before filling
# Flag each hour's origin BEFORE filling: 'H' = original HOBO value,
# 'R' = missing, to be filled from the reference regression below.
hobo_beech <- hobo_beech %>%
mutate(origin = if_else(is.na(th), 'R', 'H')) # Flag NA as 'R', others as 'H'# Linear regression between HOBO data and Beech Tower data
# lm() silently drops rows where th is NA (default na.action), so the model
# is trained only on hours with real HOBO data.
fmodel <- lm(th ~ beech_th, data = hobo_beech)
# Fill missing 'th' values using model predictions
# Rows where the reference itself is NA would remain NA after this step.
hobo_beech$th[is.na(hobo_beech$th)] <- predict(fmodel, newdata = hobo_beech[is.na(hobo_beech$th), ])
# Calculate R-squared
rsq <- summary(fmodel)$r.squared
print(paste0('The model\'s R-squared = ', round(rsq, 3)))[1] "The model's R-squared = 0.97"
missing_data_count <- sum(is.na(hobo_beech$th), na.rm = TRUE)
print(paste0('Number of missing data left: ', missing_data_count))[1] "Number of missing data left: 0"
# Ensure no missing data remains
if (missing_data_count == 0) {
cat("No missing data left.\n")
} else {
cat("Some missing data still remains.\n")
}No missing data left.
# Time series of the gap-filled hourly temperatures. The original mapped the
# point colour to is.na(th), which is always FALSE after filling, so the
# "Missing Data" points could never appear. Colour by the `origin` flag
# instead: 'R' marks hours filled from the reference regression, 'H' marks
# original HOBO hours.
ggplot(hobo_beech) +
  geom_line(aes(x = date_time, y = th), color = "blue") + # Line for filled series
  geom_point(aes(x = date_time, y = th, color = origin), size = 3, shape = 1) + # Points mark data origin
  labs(
    title = "Time Series of Temperature with Missing Data Points Highlighted",
    x = "Datetime",
    y = "Temperature"
  ) +
  scale_color_manual(values = c("R" = "red", "H" = "blue"), labels = c("R" = "Filled from Reference", "H" = "HOBO Data")) +
  theme_minimal() +
  theme(legend.title = element_blank())
library(gridExtra)
# NOTE(review): both panels plot the same post-fill `th` column, so the
# "Original Data (With Missing Values)" panel is identical to the filled one.
# To show a true before/after, keep a copy of th prior to filling.
plot_original <- ggplot(hobo_beech) +
geom_line(aes(x = date_time, y = th), color = "red") +
labs(title = "Original Data (With Missing Values)")
plot_filled <- ggplot(hobo_beech) +
geom_line(aes(x = date_time, y = th), color = "blue") +
labs(title = "Data After Filling Missing Values")
grid.arrange(plot_original, plot_filled, nrow = 1)4.3. upload data
# Final upload file: hourly timestamp (formatted to the hour), temperature
# rounded to 3 decimals, and the origin flag ('H' = HOBO, 'R' = filled).
final_data <- hobo_beech %>%
mutate(dttm = format(date_time, "%Y-%m-%d %H"), # Create dttm in the correct format
th = round(th, 3)) %>% # Round temperature values
select(dttm, th, origin) # Select necessary columns
# Save the final dataset to a CSV file
write.csv(final_data, "10350042_Th.csv", row.names = FALSE)
# View the final dataset
View(final_data)# Read the CSV file
# Re-download the uploaded hourly file to verify completeness.
data_count <- read.csv("https://raw.githubusercontent.com/data-hydenv/data/refs/heads/master/hobo/2025/hourly/10350042_Th.csv")
# sum(is.na(.)) over a whole data frame counts NAs across every column.
na_count <- sum(is.na(data_count))
if (na_count == 0) {
print("No NA values found in the CSV file.")
} else {
print(paste("There are", na_count, "NA values in the CSV file."))
}[1] "No NA values found in the CSV file."
# Count the total number of rows
total_count <- nrow(data_count)
# Count the occurrences of each origin
count_origin <- table(data_count$origin)
# Calculate percentages
# Share of hours with original HOBO data ('H') vs. regression-filled ('R').
percent_H <- (count_origin["H"] / total_count) * 100
percent_R <- (count_origin["R"] / total_count) * 100
# Print the results
print(paste("Percentage of H:", round(percent_H, 2), "%"))[1] "Percentage of H: 91.21 %"
print(paste("Percentage of R:", round(percent_R, 2), "%"))[1] "Percentage of R: 8.79 %"
8.79% of the data was filled using Regression Analysis. These imputations should be taken into consideration when data is used for further analysis.